In [1]:
# Nimish Bendre BU ID : U18700022
# CS 677 - Term Project - Likes analysis for Spotify
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import missingno as msno
import seaborn as sns
from prettytable import PrettyTable
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import plotly.express as px
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
import sklearn.metrics as metrics
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
In [3]:
# Ignore warning messages
warnings.filterwarnings("ignore")

# Load spotify dataset
df_sy = pd.read_csv("C:\\BU\\Spotify_Youtube.csv")
# Add the Review column to act as class attribute and set the values. <300K = 'Good', >300K and < 1M is "Great'
# and >1M is 'Awesome'
df_sy['Review'] = ['Good' if likes < 300000 else 'Great' if likes < 1000000 else 'Awesome' for likes in df_sy['Likes']]
corr_matrix = df_sy.corr()
sns.heatmap(corr_matrix, annot=True)
plt.show()
In [4]:
# Interesting insights about the Spotify dataset
# Top 5 artists by likes and views
grouped_likes_views = df_sy.groupby('Artist').agg({'Likes': 'sum', 'Views': 'sum'})
# Sort the results by likes and views, and select the top 5 artists
top_artists_by_likes = grouped_likes_views.sort_values('Likes', ascending=False).head(5)
top_artists_by_views = grouped_likes_views.sort_values('Views', ascending=False).head(5)
# Plot the top 5 artists by likes and views using a bar chart
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
# Plot the top 5 artists by likes
axs[0].bar(top_artists_by_likes.index, top_artists_by_likes['Likes'], color='#19ACCE')
axs[0].set_title('Top 5 Artists by Likes')
axs[0].set_xlabel('Artist')
axs[0].set_ylabel('Total Likes')
# Plot the top 5 artists by views
axs[1].bar(top_artists_by_views.index, top_artists_by_views['Views'], color='#146C94')
axs[1].set_title('Top 5 Artists by Views')
axs[1].set_xlabel('Artist')
axs[1].set_ylabel('Total Views')
plt.tight_layout()
plt.show()
In [5]:
# Top 5 Albums by likes and views
grouped_likes_views = df_sy.groupby('Album').agg({'Likes': 'sum', 'Views': 'sum'})
# Sort the results by likes and views, and select the top 5 albums
top_albums_by_likes = grouped_likes_views.sort_values('Likes', ascending=False).head(5)
top_albums_by_views = grouped_likes_views.sort_values('Views', ascending=False).head(5)
# Plot the top 5 albums by likes and views using a bar chart
fig, axs = plt.subplots(1, figsize=(10, 5))
# Plot the top 5 albums by likes
axs.bar(top_albums_by_likes.index, top_albums_by_likes['Likes'], color='#643A6B')
# plt.xticks(top_albums_by_likes.index, rotation=0, ha='center', wrap=True)
axs.set_title('Top 5 Albums by Likes')
axs.set_xlabel('Album')
axs.set_ylabel('Total Likes')
plt.show()
# Plot the top 5 albums by views
fig, axs1 = plt.subplots(1, figsize=(10, 5))
axs1.bar(top_albums_by_views.index, top_albums_by_views['Views'], color='#917FB3')
# plt.xticks(top_albums_by_views.index, rotation=0, ha='center', wrap=True)
axs1.set_title('Top 5 Albums by Views')
axs1.set_xlabel('Album')
axs1.set_ylabel('Total Views')
plt.tight_layout()
plt.show()
In [6]:
# Top 5 Danceable Albums
# Group the data by album and calculate the mean danceability score for each album
grouped = df_sy.groupby('Album').agg({'Danceability': 'mean'})
# Sort the results by danceability score, and select the top 5 albums
top_albums = grouped.sort_values('Danceability', ascending=False).head(5)
# Plot bar chart of the top 5 danceable albums
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_xlim(0.9, 1)
plt.yticks(wrap=True)
ax.barh(top_albums.index, top_albums['Danceability'], color='#5F264A')
ax.set_title('Top 5 Danceable Albums')
ax.set_xlabel('Danceability')
ax.set_ylabel('Album')
plt.show()
In [7]:
# Count of official and unofficial Youtube videos
official_video_counts = df_sy['official_video'].value_counts()
# Create a bar plot of official video counts
fig, ax = plt.subplots()
official_video_counts.plot(kind='bar', ax=ax, color=['#1f77b4', '#ff7f0e'])
ax.set_title('Official Videos')
ax.set_xlabel('Official Video')
ax.set_ylabel('Count')
ax.set_xticklabels(['No', 'Yes'], rotation=0)
plt.show()
In [8]:
# Select the columns you want to calculate the correlation with
feature_columns = ['Danceability', 'Loudness', 'Speechiness', 'Duration_ms', 'Channel', 'Energy']
# Calculate the correlation matrix
corr_matrix = df_sy[feature_columns + ['Review']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.show()
In [9]:
# Data cleaning
# Removing unwanted columns from the dataset
df_sy.drop(['Url_spotify', 'Uri', 'Url_youtube', 'Title', 'Views', 'Likes', 'Comments', 'Description', 'Stream'],
           axis=1, inplace=True)
# Removing the unnamed column from dataset
df_sy = df_sy.drop(df_sy.columns[df_sy.columns.str.contains('Unnamed', case=False)], axis=1)
# Removing the rows with NaN values
df_sy = df_sy.dropna()
# Checking for missing values in dataset
msno.bar(df_sy)
plt.show()
In [10]:
# To enumerate labels into values for enumerated columns in the dataset
# Could not optimize this code due to time constraint
le = LabelEncoder()
df_sy['Artist'] = le.fit_transform(df_sy.Artist.values)
df_sy['Track'] = le.fit_transform(df_sy.Track.values)
df_sy['Album'] = le.fit_transform(df_sy.Album.values)
df_sy['Album_type'] = le.fit_transform(df_sy.Album_type.values)
df_sy['Danceability'] = le.fit_transform(df_sy.Danceability.values)
df_sy['Energy'] = le.fit_transform(df_sy.Energy.values)
df_sy['Key'] = le.fit_transform(df_sy.Key.values)
df_sy['Loudness'] = le.fit_transform(df_sy.Loudness.values)
df_sy['Speechiness'] = le.fit_transform(df_sy.Speechiness.values)
df_sy['Acousticness'] = le.fit_transform(df_sy.Acousticness.values)
df_sy['Instrumentalness'] = le.fit_transform(df_sy.Instrumentalness.values)
df_sy['Liveness'] = le.fit_transform(df_sy.Liveness.values)
df_sy['Valence'] = le.fit_transform(df_sy.Valence.values)
df_sy['Tempo'] = le.fit_transform(df_sy.Tempo.values)
df_sy['Duration_ms'] = le.fit_transform(df_sy.Duration_ms.values)
df_sy['Channel'] = le.fit_transform(df_sy.Channel.values)
df_sy['Licensed'] = le.fit_transform(df_sy.Licensed.values)
df_sy['official_video'] = le.fit_transform(df_sy.official_video.values)
df_sy['Review'] = le.fit_transform(df_sy.Review.values)
# -------------------------------------------------------------------------------------------
In [11]:
# Get the feature dataframe
x_sy_data = df_sy.iloc[:, :-1]
# Get the class dataframe
y_sy_data = df_sy.iloc[:, -1]
# Feature selection algorithm - Extra trees classifier

clf = ExtraTreesClassifier(n_estimators=200, random_state=4)
clf = clf.fit(x_sy_data, y_sy_data)
coef = pd.concat([pd.DataFrame(x_sy_data.columns), pd.DataFrame(np.transpose(clf.feature_importances_))], axis=1)
# print(coef)
coef.columns = ['Feature', 'Importance']
coef.sort_values(by='Importance', inplace=True)
fig = px.bar(coef, x="Feature", y="Importance", title='Feature Importance from Extra Trees classifier',
             color='Importance')
fig.show()
In [12]:
# Analysis with the top importance features. Features with coefficient >= 0.6 are taken for analysis
model = SelectFromModel(clf, prefit=True)
feature_idx = model.get_support()
feature_name = x_sy_data.columns[feature_idx]
print('\n', len(feature_name), ' Features selected based on importance - Extra Trees Classifer :', feature_name)
x_sy_new_impfeature = df_sy[list(feature_name)]
 14  Features selected based on importance - Extra Trees Classifer : Index(['Artist', 'Track', 'Album', 'Danceability', 'Energy', 'Key', 'Loudness',
       'Speechiness', 'Acousticness', 'Liveness', 'Valence', 'Tempo',
       'Duration_ms', 'Channel'],
      dtype='object')
In [13]:
# Split the dataset into train and test dataset 60% train and 40% test
# Select class attribute for dataset
y_sy_new_class = df_sy.iloc[:, -1]
# Split the dataset into train and test
x_sy_train_new, x_sy_test_new, y_sy_train_new, y_sy_test_new = train_test_split(x_sy_new_impfeature,
                                                                                y_sy_new_class, test_size=0.6)
In [18]:
# Using K-NN algorithm for prediction and accuracy calculation
# Building the model
K = []
training = []
test = []
scores = {}
cv_scores = []
k_range = range(3, 13, 2)

for k in k_range:
    kn_clf = KNeighborsClassifier(n_neighbors=k)
    kn_clf.fit(x_sy_train_new, y_sy_train_new)
    y_pred = kn_clf.predict(x_sy_test_new)
    training_score = kn_clf.score(x_sy_train_new, y_sy_train_new)
    test_score = kn_clf.score(x_sy_test_new, y_sy_test_new)
    K.append(k)

    training.append(training_score)
    test.append(test_score)
    # scores[k] = [training_score, test_score]

    scores = cross_val_score(kn_clf, x_sy_train_new, y_sy_train_new, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

# Find the best value of k
best_k = k_range[np.argmax(cv_scores)]
print('Best value of k:', best_k)
# Use optimum value of k to calculate performance measures
kn_clf = KNeighborsClassifier(n_neighbors=best_k)
kn_clf.fit(x_sy_train_new, y_sy_train_new)
y_pred_knn = kn_clf.predict(x_sy_test_new)
accuracy_knn = accuracy_score(y_sy_test_new, y_pred_knn) * 100
print("\nAccuracy with K-NN classifier is : ", round(accuracy_knn, 3))
confusion_matrix = metrics.confusion_matrix(y_sy_test_new, y_pred_knn)
print("\nConfusion Matrix - K-NN\n", confusion_matrix)
Best value of k: 11

Accuracy with K-NN classifier is :  63.19

Confusion Matrix - K-NN
 [[ 240 1632   70]
 [ 378 7318  252]
 [ 159 1981  119]]
In [20]:
# Use Gaussian Support vector machine (SVM) Classifier for predictions
svc_clf = SVC(kernel='rbf')
# Train the model
svc_clf.fit(x_sy_train_new, y_sy_train_new)
# Predict the response from dataset
y_pred_svm = svc_clf.predict(x_sy_test_new)
# Calculate model accuracy
accuracy_g_svm = round(metrics.accuracy_score(y_sy_test_new, y_pred_svm) * 100, 2)
# Compute accuracy
print("\nAccuracy with Gaussian SVM classifier is : ", accuracy_g_svm)
confusion_matrix = metrics.confusion_matrix(y_sy_test_new, y_pred_svm)
print("\nConfusion Matrix - SVM Gaussian \n", confusion_matrix)
Accuracy with Gaussian SVM classifier is :  65.42

Confusion Matrix - SVM Gaussian 
 [[   0 1942    0]
 [   0 7948    0]
 [   0 2259    0]]
In [22]:
# Use polynomial Support vector machine (SVM) Classifier with degree 2
svc_p_clf = SVC(kernel='poly', degree=2)
# Train the model
svc_p_clf.fit(x_sy_train_new, y_sy_train_new)
# Predict the response from dataset
y_pred_p_svm = svc_p_clf.predict(x_sy_test_new)
# Calculate model accuracy
accuracy_p_svm_score = metrics.accuracy_score(y_sy_test_new, y_pred_p_svm) * 100
# Compute accuracy with polynomial SVM degree 2
print("\nAccuracy with polynomial SVM classifier is : ", round(accuracy_p_svm_score,2))
confusion_matrix = metrics.confusion_matrix(y_sy_test_new, y_pred_p_svm)
print("\nConfusion Matrix - SVM Polynomial\n", confusion_matrix)

# Use Linear Support vector machine (SVM) Classifier for predictions
Accuracy with polynomial SVM classifier is :  65.42

Confusion Matrix - SVM Polynomial
 [[   0 1942    0]
 [   0 7948    0]
 [   0 2259    0]]
In [23]:
# Use Linear Support vector machine (SVM) Classifier for predictions
svc_lin_clf = LinearSVC()
# Train the model
svc_lin_clf.fit(x_sy_train_new, y_sy_train_new)
# Predict the response from dataset
y_pred_lin_svm = svc_lin_clf.predict(x_sy_test_new)
# Calculate model accuracy
accuracy_lin_svm_score = metrics.accuracy_score(y_sy_test_new, y_pred_lin_svm) * 100
# Compute accuracy
print("\nAccuracy with Linear SVM classifier is : ", round(accuracy_lin_svm_score, 2))
confusion_matrix = metrics.confusion_matrix(y_sy_test_new, y_pred_lin_svm)
print("\nConfusion Matrix - Linear SVM \n", confusion_matrix)
Accuracy with Linear SVM classifier is :  63.17

Confusion Matrix - Linear SVM 
 [[  25 1694  223]
 [  45 7491  412]
 [  19 2082  158]]
In [25]:
# Use Random Forest Classifier for n = 1 to 10 and depth = 1 to 5
accuracy_score_arr = []
n_tree = []
d_depth = []
df_result = pd.DataFrame()
n = 11
for i in range(1, n):
    d = 6
    for j in range(1, d):
        rfc = RandomForestClassifier(n_estimators=i, max_depth=j)
        rfc.fit(x_sy_train_new, y_sy_train_new)
        # Prediction
        y_prob_rfc = rfc.predict(x_sy_test_new)
        # Accuracy calculation
        accuracy_all_score = accuracy_score(y_sy_test_new, y_prob_rfc) * 100
        n_tree.append(i)
        d_depth.append(j)
        accuracy_score_arr.append(round(accuracy_all_score, 3))

df_result['N value'] = n_tree
df_result["Depth"] = d_depth
df_result["Accuracy Score"] = accuracy_score_arr
# print(df_result)
# Find the n and depth for maximum accuracy
accuracy_score_arr = df_result.max()
accuracy_rf = round(accuracy_score_arr['Accuracy Score'], 2)
print("\n Random Forest Classifier - Maximum accuracy is obtained for:\n", accuracy_score_arr)
confusion_matrix = metrics.confusion_matrix(y_sy_test_new, y_prob_rfc)
print("\n Confusion Matrix - Random Forest\n", confusion_matrix)
 Random Forest Classifier - Maximum accuracy is obtained for:
 N value           10.000
Depth              5.000
Accuracy Score    65.421
dtype: float64

 Confusion Matrix - Random Forest
 [[   0 1942    0]
 [   0 7948    0]
 [   0 2259    0]]
In [26]:
# Use Decision Tree Classifier
dtc = DecisionTreeClassifier(criterion="gini", random_state=100, max_depth=3, min_samples_leaf=5)
# Train the model
dtc.fit(x_sy_train_new, y_sy_train_new)
# Prediction
y_prob_dtc = dtc.predict(x_sy_test_new)
# Accuracy calculation
accuracy_dt = accuracy_score(y_sy_test_new, y_prob_dtc) * 100
# 1 - Compute accuracy for year 2
print("\nAccuracy with Decision Tree classifier is : ", round(accuracy_dt, 2))
confusion_matrix = metrics.confusion_matrix(y_sy_test_new, y_prob_dtc)
print("\nConfusion Matrix - Decision Tree classifier\n", confusion_matrix)
Accuracy with Decision Tree classifier is :  65.34

Confusion Matrix - Decision Tree classifier
 [[   0 1940    2]
 [   0 7936   12]
 [   0 2257    2]]
In [27]:
# -------------AdaBoost------------------
# For λ = 0.5 and λ = 1, constructed an Adaboost classifier with two base estimators of logistic regression
# and Naive Bayesian.
# Initialize base estimators
lr_clf = LogisticRegression()
nb_clf = GaussianNB()
classifiers = [lr_clf, nb_clf]
lambdas = [0.5, 1]
N = list(range(1, 16))
for lam in lambdas:

    for clf in classifiers:
        clf_name = type(clf).__name__
        error_rates = []
        for n in N:
            # Train Adaboost classifier with n weak learners and base estimator
            ada_clf = AdaBoostClassifier(base_estimator=clf, n_estimators=n, learning_rate=lam)
            ada_clf.fit(x_sy_train_new, y_sy_train_new)
            # Predict on year 2 data and calculate error rate
            y_pred_ada = ada_clf.predict(x_sy_test_new)
            error_rate = 1 - accuracy_score(y_sy_test_new, y_pred_ada)
            error_rates.append(error_rate)

        # Plot the error rates
        plt.plot(N, error_rates, label=clf_name)
        plt.xlabel('N ')
        plt.ylabel('Error rate')
        plt.title('Adaboost error rate for ' + clf_name + ' with λ= ' + str(lam))
        plt.legend()
        # plt.show()

# Best value of N* for each base estimator for lambda = 0.5
best_Ns = {}
best_accuracy = {}
for base_estimator in classifiers:
    error_rates = []
    accuracy_list = []
    # Iterate over the range of N values
    for n in N:
        ada_clf = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n, learning_rate=0.5)
        ada_clf.fit(x_sy_train_new, y_sy_train_new)
        y_pred_ada_clf = ada_clf.predict(x_sy_test_new)
        accuracy_ada_clf = accuracy_score(y_sy_test_new, y_pred_ada_clf)
        accuracy_list.append(accuracy_ada_clf)
        error_rate_ada_clf = round(1 - accuracy_ada_clf, 2)
        error_rates.append(error_rate_ada_clf)
    # Find the index of the minimum error rate
    best_N_idx = np.argmin(error_rates)
    best_Ns[str(base_estimator)] = N[best_N_idx]
    best_accuracy[str(base_estimator)] = accuracy_list[best_N_idx]
# Print the best N* for each base estimator
for estimator, best_N in best_Ns.items():
    estimator = estimator.replace("(", " ").replace(")", " ")
    # print('\nBest value of N* for',estimator,'is : ', best_N)
# print(best_accuracy)
accuracy_gnb_adab = round(best_accuracy['GaussianNB()'], 2) * 100
accuracy_lr_adab = round(best_accuracy['LogisticRegression()'], 2) * 100
# Accuracy for each base estimator
for estimator, best_acc in best_accuracy.items():
    estimator = estimator.replace("(", " ").replace(")", " ")
    print('\nAccuracy with AdaBoost classifier and ', estimator, 'is : ', round(best_acc, 2))
    
Accuracy with AdaBoost classifier and  LogisticRegression   is :  0.65

Accuracy with AdaBoost classifier and  GaussianNB   is :  0.66
In [28]:
# Logistic regression model
lrm = LogisticRegression()
lrm.fit(x_sy_train_new, y_sy_train_new)
# Predict
y_pred_lr = lrm.predict(x_sy_test_new)
# Logistic regression co-efficients
print("Logistic regression coefficients:")
print("b0 = ", lrm.intercept_)
print("b1 = ", lrm.coef_[0][0])
print("b2 = ", lrm.coef_[0][1])
# Accuracy and Confusion Matrix
accuracy_log_reg = round(accuracy_score(y_sy_test_new, y_pred_lr), 2) * 100
print("\nAccuracy with Logistic Regression classifier is :", accuracy_log_reg)
cm = metrics.confusion_matrix(y_sy_test_new, y_pred_lr)
print("\nConfusion Matrix - Logistic Regression classifier : \n", cm)
Logistic regression coefficients:
b0 =  [-1.20383600e-06  2.13776038e-06 -9.33924382e-07]
b1 =  -7.337901579735785e-05
b2 =  -6.4643313801626185e-06

Accuracy with Logistic Regression classifier is : 65.0

Confusion Matrix - Logistic Regression classifier : 
 [[  88 1854    0]
 [  91 7853    4]
 [  59 2200    0]]
In [31]:
# Tabular demonstration of accuracy and classifiers
# Summarize the table
print('\nSummary Table with features by importance using \nExtra Trees Classifier with importance > 0.6')
columns = ['Classifier', 'Accuracy %']
hdr = ['K-NN', 'SVM-Gaussian', 'SVM-Polynomial', 'SVM-Linear', 'Random Forest', 'Decision Tree',
       'AdaBoost with LogisticRegression', 'AdaBoost with Gaussian Naive Bayes', 'LogisticRegression']
c1 = [round(accuracy_knn, 2), accuracy_g_svm, round(accuracy_p_svm_score, 2), round(accuracy_lin_svm_score, 2)
    , accuracy_rf, round(accuracy_dt, 2), accuracy_lr_adab, accuracy_gnb_adab, accuracy_log_reg]
class_Table = PrettyTable()
class_Table.add_column(columns[0], hdr)
class_Table.add_column(columns[1], c1)
print(class_Table)
Summary Table with features by importance using 
Extra Trees Classifier with importance > 0.6
+------------------------------------+------------+
|             Classifier             | Accuracy % |
+------------------------------------+------------+
|                K-NN                |   63.19    |
|            SVM-Gaussian            |   65.42    |
|           SVM-Polynomial           |   65.42    |
|             SVM-Linear             |   63.17    |
|           Random Forest            |   65.42    |
|           Decision Tree            |   65.34    |
|  AdaBoost with LogisticRegression  |    65.0    |
| AdaBoost with Gaussian Naive Bayes |    66.0    |
|         LogisticRegression         |    65.0    |
+------------------------------------+------------+
In [32]:
# ----------- Code Start - Dataset with Top 4 features by importance using Extra Trees Classifier --------------
# Take the top 4 features by importance for the analysis
x_sy_top4 = df_sy[['Loudness','Duration_ms','Danceability','Speechiness']]
# print(x_sy_top4.head(10))
# Split the dataset into train and test dataset 60% train and 40% test
# Select class attribute for dataset
y_sy_top4 = df_sy.iloc[:, -1]
# Split the dataset into train and test
x_sy_train_top4, x_sy_test_top4, y_sy_train_top4, y_sy_test_top4 = train_test_split(x_sy_top4,
                                                                                y_sy_top4, test_size=0.6)
In [33]:
# Using K-NN algorithm for prediction and accuracy calculation
# Building the model
K = []
training = []
test = []
scores = {}
cv_scores = []
k_range = range(3,13,2)

for k in k_range:
    kn_clf = KNeighborsClassifier(n_neighbors=k)
    kn_clf.fit(x_sy_train_top4, y_sy_train_top4)
    y_pred = kn_clf.predict(x_sy_test_top4)
    training_score = kn_clf.score(x_sy_train_top4, y_sy_train_top4)
    test_score = kn_clf.score(x_sy_test_top4, y_sy_test_top4)
    K.append(k)

    training.append(training_score)
    test.append(test_score)
    # scores[k] = [training_score, test_score]

    scores = cross_val_score(kn_clf, x_sy_train_top4, y_sy_train_top4, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())

# Find the best value of k
best_k = k_range[np.argmax(cv_scores)]
print('Best value of k:', best_k)
# Use optimum value of k to calculate performance measures
kn_clf = KNeighborsClassifier(n_neighbors=best_k)
kn_clf.fit(x_sy_train_top4, y_sy_train_top4)
y_pred_knn = kn_clf.predict(x_sy_test_top4)
accuracy_knn = round(accuracy_score(y_sy_test_top4,y_pred_knn) * 100,2)
print("\nAccuracy with K-NN classifier is : ", accuracy_knn)
confusion_matrix = metrics.confusion_matrix(y_sy_test_top4, y_pred_knn)
print("\nConfusion Matrix - K-NN\n",confusion_matrix)
Best value of k: 11

Accuracy with K-NN classifier is :  63.03

Confusion Matrix - K-NN
 [[ 204 1634   95]
 [ 405 7340  256]
 [ 153 1948  114]]
In [34]:
# Use Gaussian Support vector machine (SVM) Classifier for predictions
svc_clf = SVC(kernel='rbf')
# Train the model
svc_clf.fit(x_sy_train_top4, y_sy_train_top4)
# Predict the response from dataset
y_pred_svm = svc_clf.predict(x_sy_test_top4)
# Calculate model accuracy
accuracy_g_svm = round(metrics.accuracy_score(y_sy_test_top4, y_pred_svm) * 100,2)
# Compute accuracy
print("\nAccuracy with Gaussian SVM classifier is : ", accuracy_g_svm)
confusion_matrix = metrics.confusion_matrix(y_sy_test_top4, y_pred_svm)
print("\nConfusion Matrix - SVM Gaussian \n",confusion_matrix)
Accuracy with Gaussian SVM classifier is :  65.86

Confusion Matrix - SVM Gaussian 
 [[   0 1933    0]
 [   0 8001    0]
 [   0 2215    0]]
In [35]:
# Use polynomial Support vector machine (SVM) Classifier with degree 2
svc_p_clf = SVC(kernel='poly', degree=2)
# Train the model
svc_p_clf.fit(x_sy_train_top4, y_sy_train_top4)
# Predict the response from dataset
y_pred_p_svm = svc_p_clf.predict(x_sy_test_top4)
# Calculate model accuracy
accuracy_p_svm_score = metrics.accuracy_score(y_sy_test_top4, y_pred_p_svm) * 100
# Compute accuracy with polynomial SVM degree 2
print("\nAccuracy with polynomial SVM classifier is : ", round(accuracy_p_svm_score, 2))
confusion_matrix = metrics.confusion_matrix(y_sy_test_top4, y_pred_p_svm)
print("\n Confusion Matrix - SVM Polynomial\n",confusion_matrix)
Accuracy with polynomial SVM classifier is :  65.86

 Confusion Matrix - SVM Polynomial
 [[   0 1933    0]
 [   0 8001    0]
 [   0 2215    0]]
In [36]:
# Use Linear Support vector machine (SVM) Classifier for predictions
svc_lin_clf = LinearSVC()
# Train the model
svc_lin_clf.fit(x_sy_train_top4, y_sy_train_top4)
# Predict the response from dataset
y_pred_lin_svm = svc_lin_clf.predict(x_sy_test_top4)
# Calculate model accuracy
accuracy_lin_svm_score = metrics.accuracy_score(y_sy_test_top4, y_pred_lin_svm) * 100
# Compute accuracy
print("\nAccuracy with Linear SVM classifier is : ", round(accuracy_lin_svm_score, 2))
confusion_matrix = metrics.confusion_matrix(y_sy_test_top4, y_pred_lin_svm)
print("\nConfusion Matrix - Linear SVM \n",confusion_matrix)
Accuracy with Linear SVM classifier is :  46.33

Confusion Matrix - Linear SVM 
 [[ 739 1169   25]
 [2827 4843  331]
 [ 907 1261   47]]
In [38]:
# Use Random Forest Classifier for n = 1 to 10 and depth = 1 to 5
accuracy_score_arr = []
n_tree = []
d_depth = []
df_result = pd.DataFrame()
n = 11
for i in range(1, n):
    d = 6
    for j in range(1, d):
        rfc = RandomForestClassifier(n_estimators=i, max_depth=j)
        rfc.fit(x_sy_train_top4, y_sy_train_top4)
        # Prediction
        y_prob_rfc = rfc.predict(x_sy_test_top4)
        # Accuracy calculation
        accuracy_all_score = accuracy_score(y_sy_test_top4, y_prob_rfc) * 100
        n_tree.append(i)
        d_depth.append(j)
        accuracy_score_arr.append(round(accuracy_all_score, 3))

df_result['N value'] = n_tree
df_result["Depth"] = d_depth
df_result["Accuracy Score"] = accuracy_score_arr
# print(df_result)
# Find the n and depth for maximum accuracy
accuracy_score_arr = df_result.max()
accuracy_rf = round(accuracy_score_arr['Accuracy Score'],2)
print("\n Random Forest Classifier - Maximum accuracy is obtained for:\n", accuracy_score_arr)
confusion_matrix = metrics.confusion_matrix(y_sy_test_top4, y_prob_rfc)
print("\n Confusion Matrix - Random Forest\n",confusion_matrix)
 Random Forest Classifier - Maximum accuracy is obtained for:
 N value           10.000
Depth              5.000
Accuracy Score    65.923
dtype: float64

 Confusion Matrix - Random Forest
 [[   0 1933    0]
 [   0 8001    0]
 [   0 2215    0]]
In [39]:
# Use Decision Tree Classifier
dtc = DecisionTreeClassifier(criterion="gini", random_state=100, max_depth=3, min_samples_leaf=5)
# Train the model
dtc.fit(x_sy_train_top4, y_sy_train_top4)
# Prediction
y_prob_dtc = dtc.predict(x_sy_test_top4)
# Accuracy calculation
accuracy_dt = accuracy_score(y_sy_test_top4, y_prob_dtc) * 100
print("\nAccuracy with Decision Tree classifier is : ", round(accuracy_dt, 2))
confusion_matrix = metrics.confusion_matrix(y_sy_test_top4, y_prob_dtc)
print("\nConfusion Matrix - Decision Tree classifier\n",confusion_matrix)
Accuracy with Decision Tree classifier is :  65.86

Confusion Matrix - Decision Tree classifier
 [[   0 1933    0]
 [   0 8001    0]
 [   0 2215    0]]
In [40]:
#-------------AdaBoost------------------
# For λ = 0.5 and λ = 1, constructed an Adaboost classifier with two base estimators of logistic regression
# and Naive Bayesian.
# Initialize base estimators
lr_clf = LogisticRegression()
nb_clf = GaussianNB()
classifiers = [lr_clf, nb_clf]
lambdas = [0.5, 1]
N = list(range(1, 16))
for lam in lambdas:

    for clf in classifiers:
        clf_name = type(clf).__name__
        error_rates = []
        for n in N:
            # Train Adaboost classifier with n weak learners and base estimator
            ada_clf = AdaBoostClassifier(base_estimator=clf, n_estimators=n, learning_rate=lam)
            ada_clf.fit(x_sy_train_top4, y_sy_train_top4)
            # Predict on year 2 data and calculate error rate
            y_pred_ada = ada_clf.predict(x_sy_test_top4)
            error_rate = 1 - accuracy_score(y_sy_test_top4, y_pred_ada)
            error_rates.append(error_rate)

        # Plot the error rates
        plt.plot(N, error_rates, label=clf_name)
        plt.xlabel('N ')
        plt.ylabel('Error rate')
        plt.title('Adaboost error rate for ' + clf_name + ' with λ= ' + str(lam))
        plt.legend()
        # plt.show()

# Best value of N* for each base estimator for lambda = 0.5
best_Ns = {}
best_accuracy = {}
for base_estimator in classifiers:
    error_rates = []
    accuracy_list = []
    # Iterate over the range of N values
    for n in N:
        ada_clf = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n, learning_rate=0.5)
        ada_clf.fit(x_sy_train_top4, y_sy_train_top4)
        y_pred_ada_clf = ada_clf.predict(x_sy_test_top4)
        accuracy_ada_clf = accuracy_score(y_sy_test_top4, y_pred_ada_clf)
        accuracy_list.append(accuracy_ada_clf)
        error_rate_ada_clf = round(1 - accuracy_ada_clf, 2)
        error_rates.append(error_rate_ada_clf)
    # Find the index of the minimum error rate
    best_N_idx = np.argmin(error_rates)
    best_Ns[str(base_estimator)] = N[best_N_idx]
    best_accuracy[str(base_estimator)] = accuracy_list[best_N_idx]
# Print the best N* for each base estimator
for estimator, best_N in best_Ns.items():
    estimator = estimator.replace("(", " ").replace(")", " ")
    # print('\nBest value of N* for',estimator,'is : ', best_N)
# print(best_accuracy)
accuracy_gnb_adab = round(best_accuracy['GaussianNB()'],2) * 100
accuracy_lr_adab = round(best_accuracy['LogisticRegression()'],2) * 100
# Accuracy for each base estimator
for estimator, best_acc in best_accuracy.items():
    estimator = estimator.replace("(", " ").replace(")", " ")
    print('\nAccuracy with AdaBoost classifier and ', estimator, 'is : ', round(best_acc,2))
Accuracy with AdaBoost classifier and  LogisticRegression   is :  0.66

Accuracy with AdaBoost classifier and  GaussianNB   is :  0.66
In [41]:
# Logistic regression model
lrm = LogisticRegression()
lrm.fit(x_sy_train_top4, y_sy_train_top4)
# Predict
y_pred_lr = lrm.predict(x_sy_test_top4)
# Logistic regression co-efficients
print("Logistic regression coefficients:")
print("b0 = ", lrm.intercept_)
print("b1 = ", lrm.coef_[0][0])
print("b2 = ", lrm.coef_[0][1])
# Accuracy and Confusion Matrix
accuracy_log_reg = round(accuracy_score(y_sy_test_top4, y_pred_lr),2) * 100
print("\nAccuracy with Logistic Regression classifier is :", accuracy_log_reg)
cm = metrics.confusion_matrix(y_sy_test_top4, y_pred_lr)
print("\nConfusion Matrix - Logistic Regression classifier : \n", cm)
Logistic regression coefficients:
b0 =  [-7.30865541e-06  1.24016122e-05 -5.09295677e-06]
b1 =  3.0808607640375015e-05
b2 =  -2.5135848094315654e-05

Accuracy with Logistic Regression classifier is : 66.0

Confusion Matrix - Logistic Regression classifier : 
 [[   0 1933    0]
 [  10 7991    0]
 [   1 2214    0]]
In [42]:
# Tabular demonstration of accuracy and classifiers
# Summarize the table
print('\nSummary Table with Top 4 features by importance using \nExtra Trees Classifier')
columns = ['Classifier', 'Accuracy %']
hdr = ['K-NN','SVM-Gaussian','SVM-Polynomial','SVM-Linear','Random Forest','Decision Tree',
       'AdaBoost with LogisticRegression','AdaBoost with Gaussian Naive Bayes','LogisticRegression']
c1 = [round(accuracy_knn,2),accuracy_g_svm,round(accuracy_p_svm_score,2),round(accuracy_lin_svm_score,2)
    ,accuracy_rf,round(accuracy_dt,2),accuracy_lr_adab,accuracy_gnb_adab,accuracy_log_reg]
class_Table = PrettyTable()
class_Table.add_column(columns[0], hdr)
class_Table.add_column(columns[1], c1)
print(class_Table)
Summary Table with Top 4 features by importance using 
Extra Trees Classifier
+------------------------------------+------------+
|             Classifier             | Accuracy % |
+------------------------------------+------------+
|                K-NN                |   63.03    |
|            SVM-Gaussian            |   65.86    |
|           SVM-Polynomial           |   65.86    |
|             SVM-Linear             |   46.33    |
|           Random Forest            |   65.92    |
|           Decision Tree            |   65.86    |
|  AdaBoost with LogisticRegression  |    66.0    |
| AdaBoost with Gaussian Naive Bayes |    66.0    |
|         LogisticRegression         |    66.0    |
+------------------------------------+------------+
In [ ]:
 
In [ ]: